In [2]:
from __future__ import division
import pandas as pd
import numpy as np
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import requests
from sklearn.feature_extraction.text import CountVectorizer
import math
from bs4 import BeautifulSoup
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
import PyPDF2

In [3]:
pdfFileObj = open('oldmansea.pdf','rb')     #'rb' for read binary mode
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
pages = pdfReader.numPages
print(type(pages))
speechtext = []
print(type(speechtext))
i = 1                                  # start at page index 1, skipping the first page
while i < pages:
    pageObj = pdfReader.getPage(i)
    text = pageObj.extractText()       # extract the raw text of one page
    speechtext = speechtext + [text]   # each page becomes one document in the corpus
    i = i + 1

#print(speechtext[14])
book_df = pd.DataFrame(speechtext)
print(book_df.shape)
print(type(book_df))
print (book_df.columns)

cv = CountVectorizer(binary=False, stop_words = 'english', min_df=.025, max_df =.60) 
cv_dm = cv.fit_transform(book_df[0])
print(cv_dm.shape)
names = cv.get_feature_names()   #create list of feature names
count = np.sum(cv_dm.toarray(), axis = 0) # add up feature counts 
count2 = count.tolist()  # convert numpy array to list
count_df = pd.DataFrame(count2, index = names, columns = ['count']) # create a dataframe from the list
#count_df.sort_values(['count'], ascending = False)[0:19]


<class 'int'>
<class 'list'>
(51, 1)
<class 'pandas.core.frame.DataFrame'>
RangeIndex(start=0, stop=1, step=1)
(51, 1230)

In [4]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer() 

def stem_text(row):
    text = str(row).split() 
    stemtext = [ps.stem(word) for word in text]
    stem2text = ' '.join(stemtext)
    return stem2text
book_df['orig']=book_df[0]
book_df['stemmed'] = book_df[0].apply(lambda x: stem_text(x))
#print( book_df[0:1])
#print("~~~~~~~~~~~~~~~~~~~")
#print(book_df.stemmed[0:1])
#print(book_df.stemmed[0])

In [5]:
cv_dm = cv.fit_transform(book_df['stemmed'])
print(cv_dm.shape)
names = cv.get_feature_names()   
count = np.sum(cv_dm.toarray(), axis = 0) 
count2 = count.tolist()  
count_df = pd.DataFrame(count2, index = names, columns = ['count']) 
#count_df.sort_values(['count'], ascending = False)[0:19]


(51, 1180)

In [6]:
book_df['newtext'] = book_df.orig.apply(lambda x: x.replace('TM', ''))
# for every page x in the orig column, strip the 'TM' artifact and store the result in newtext
#print(book_df.newtext[0])

T1.

Read in or create a data frame with at least one column of text to be analyzed. This could be the text you used previously or new text. Based on the context of your dataset and the question you want to answer, identify at least terms that would be beneficial to replace and place them into a [Python] dictionary. Use your discretion concerning what terms make the most sense to replace. Apply the replacement. Compare the feature space before and after your replacement.


In [7]:
import re
_dict = { 'boat':'boats', 'old man':'om', 'Asianing.com':'','asiaing':'', 'fishes':'fish', '\n':''}

def multiple_replace(repl_dict, text):
    """Replace in 'text' all occurrences of any key in the given
    dictionary with its corresponding value. Returns the new string."""
    text = str(text).lower()

    # Create a regular expression from the dictionary keys
    regex = re.compile("(%s)" % "|".join(map(re.escape, repl_dict.keys())))

    # For each match, look up the corresponding value in the dictionary
    return regex.sub(lambda mo: repl_dict[mo.group(0)], text)

book_df['cleantext'] = book_df.newtext.apply(lambda x: multiple_replace(_dict, x))
#book_df.cleantext[0]

In [8]:
cv_dm_clean = cv.fit_transform(book_df['cleantext'])
print(cv_dm_clean.shape)
names = cv.get_feature_names()   
count = np.sum(cv_dm_clean.toarray(), axis = 0)
count2 = count.tolist() 
count_df = pd.DataFrame(count2, index = names, columns = ['count']) 
count_df.sort_values(['count'], ascending = False)[0:19]


(51, 1116)
Out[8]:
count
boy 99
hand 90
great 64
head 62
did 57
come 52
sun 50
left 50
skiff 48
shark 46
saw 46
like 46
knew 45
hands 44
eat 44
good 44
think 43
dark 42
came 41

Q1.

Write a short description of the context of the dataset in your own words. Specifically, identify what is unique about the dataset or observations in the dataset. Make sure your answer is no longer than four paragraphs, and should at minimum answer these questions: Why did you choose the terms for your replacement dictionary? Give several specific examples and explain why you chose to replace each one of them. What other replacements would you consider making, if any? Why or why not? What is the effect of the replacement on your feature space? Does this make sense? Is it helpful for answering your question? Why or why not? Audience: technical – fellow data scientists or other technical staff.

A1.

Write a short description of the context of the dataset in your own words. Specifically, identify what is unique about the dataset or observations in the dataset. The dataset comes from the novel "The Old Man and the Sea" by Ernest Hemingway. The corpus is made up of several documents, and each document corresponds to an individual page of the novel. What is unique about this corpus is that it comes from a well-read and well-regarded literary work: it is well known and has been critiqued over the years. Using this book for text mining and sentiment analysis is a good choice because there is already a significant body of research on the novel. Doing sentiment analysis on this book provides another avenue of research, and we can assess our analysis against the established literary criticism and reviews.

Why did you choose the terms for your replacement dictionary? The original document came from an online source, and the PDF had the source URL scattered throughout the document. I needed to remove it from the corpus (I replaced it with an empty string) because the URL provided no value. Because the text was extracted from a PDF, there were also many newline characters, so I added a dictionary entry to replace the newline character with an empty string as well.

Give several specific examples and explain why you chose to replace each one of them. What other replacements would you consider making, if any? Why or why not? The term "old man" appears throughout the book, so I replaced those two words with the abbreviation "om". Following the reasoning in class, I replaced "boat" with "boats" and "fishes" with "fish". These tokens can be grouped together because they carry the same information for our needs, and grouping them reduces the feature space while still retaining the relevant information.

What is the effect of the replacement on your feature space? Does this make sense? Is it helpful for answering your question? Why or why not? As anticipated, the feature space is reduced by about 80 tokens. This is expected because we removed some tokens completely and grouped others together. This helps us because the feature space is more manageable and we have not lost any important information. For other types of replacements, I would try to group character names and nicknames together. For example, if the main character is addressed by his first name, his full name, and "Mr. Lastname", then I would create dictionary entries to map those tokens to a single token, as sketched below.
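
As an illustration of that kind of grouping, the sketch below extends the cleaning step with a hypothetical name dictionary (the keys are placeholders, not strings verified against the PDF text) and compares the resulting vocabulary sizes. It reuses book_df and multiple_replace from the cells above.

from sklearn.feature_extraction.text import CountVectorizer

# Hypothetical mapping: collapse a character's name, nickname, and title into
# a single token. The keys below are placeholders for illustration only.
name_dict = {'mr. lastname': 'protagonist',
             'firstname lastname': 'protagonist',
             'firstname': 'protagonist'}

grouped = book_df.cleantext.apply(lambda x: multiple_replace(name_dict, x))

# Same vectorizer settings as above; compare the number of features.
cv_check = CountVectorizer(binary=False, stop_words='english', min_df=.025, max_df=.60)
print('before grouping:', cv_check.fit_transform(book_df['cleantext']).shape[1])
print('after grouping: ', cv_check.fit_transform(grouped).shape[1])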

T2. 

Create a sentiment dictionary from one of the sources in class or find/create your own (potential bonus points for appropriate creativity). Using your dictionary, create sentiment labels for the text entries in your corpus.


In [39]:
afinn = {}
for line in open("C:/Users/dmdal/OneDrive/Documents/TextMining/Assignment 4/AFINN-111.txt"):
    tt = line.split('\t')
    afinn.update({tt[0]:int(tt[1])})


#print(type(afinn), len(afinn))

for key, value in sorted(afinn.items())[0:10]:
    print(key + " => " + str(value))
print("~~~~~~~~~~~~")
for key, value in sorted(afinn.items())[2467:]:
    print(key + " => " + str(value))
    
def afinn_sent(inputstring):
    # Sum the AFINN score of every word on the page that appears in the dictionary
    sentcount = 0
    for word in inputstring.split():
        if word in afinn:
            sentcount = sentcount + afinn[word]

    # Totals between -3 and 3 (inclusive) are treated as Neutral
    if sentcount < -3:
        sentiment = 'Negative'
    elif sentcount > 3:
        sentiment = 'Positive'
    else:
        sentiment = 'Neutral'

    return sentiment

book_df['afinn'] = book_df.cleantext.apply(lambda x: afinn_sent(x))
#print(book_df.iloc[0:10][['cleantext','afinn']])
book_df.iloc[25:30][['cleantext','afinn']]
#print(book_df.cleantext[25])
#print(book_df.afinn[25])
book_df.afinn.value_counts()


abandon => -2
abandoned => -2
abandons => -2
abducted => -2
abduction => -2
abductions => -2
abhor => -3
abhorred => -3
abhorrent => -3
abhors => -3
~~~~~~~~~~~~
yeah => 1
yearning => 1
yeees => 2
yes => 1
youthful => 2
yucky => -2
yummy => 3
zealot => -2
zealots => -2
zealous => 2
Out[39]:
Positive    29
Neutral     16
Negative     6
Name: afinn, dtype: int64

Q2.

Write a short description of how the sentiment analysis was done and what the outcome is. Make sure your answer is no longer than three paragraphs, and should at minimum answer these questions: How is your dictionary structured? How will this work for your dataset? What measure did you use to determine the sentiment label? Why? Do any of the label assignments surprise you? Include a few specific examples of label assignment and how it was determined and why it does or does not make sense. Audience: general – management or non-technical staff.

A2.

How is your dictionary structured? How will this work for your dataset? What measure did you use to determine the sentiment label? The dictionary used for the sentiment analysis consists of a list of words, each associated with a numeric value. The higher the value, the more positive the word; the more negative the value, the more negative the word. A value of zero means the word is neutral, neither "good" nor "bad". Each page of the book is compared against the sentiment dictionary and a running total is kept. In general, if the total score for a page is positive the page is labeled positive, if it is zero it is neutral, and if it is negative the page is labeled negative. However, for my sentiment analysis, neutral is anything between -3 and 3. I used this range because I wanted to flag only pages that were noticeably more positive or negative than the rest.
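
As a small worked example of the tallying described above, the snippet below scores a made-up sentence against a toy dictionary (the scores are invented for illustration; the real values come from the AFINN-111 file) and applies the same -3/+3 cutoffs.

# Toy word scores, invented for this example; not the real AFINN-111 values.
toy_afinn = {'good': 3, 'great': 3, 'bad': -3, 'abandoned': -2, 'calm': 1}

def toy_score(text, lexicon):
    # Sum the score of every word that appears in the lexicon.
    return sum(lexicon.get(word, 0) for word in text.lower().split())

page = "the sea was calm and the fishing was good a great day"
total = toy_score(page, toy_afinn)                        # 1 + 3 + 3 = 7
label = 'Positive' if total > 3 else ('Negative' if total < -3 else 'Neutral')
print(total, label)                                       # 7 Positive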

Do any of the label assignments surprise you? After reviewing the sentiment list at a high level, not many of the entries surprise me. The biggest issue is when these words appear in the context of another word. For example, "conflict" is a negative word, but if the complete phrase were "conflict resolution" then the sentiment would be more positive. Overall, the sentiment labels make sense and help us assess the tone of the novel.
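
One possible way to handle such context-dependent words, sketched below, is to score known phrases before falling back to single-word scores. The phrase and word scores here are assumptions for illustration and are not part of AFINN-111.

# Hypothetical phrase-level overrides, checked before individual words.
phrase_scores = {'conflict resolution': 2}
word_scores = {'conflict': -2, 'resolution': 2}

def score_with_phrases(text):
    text = text.lower()
    total = 0
    for phrase, value in phrase_scores.items():
        total += text.count(phrase) * value
        text = text.replace(phrase, ' ')   # remove the phrase so its words are not double counted
    total += sum(word_scores.get(w, 0) for w in text.split())
    return total

print(score_with_phrases("the conflict resolution meeting went well"))   # 2
print(score_with_phrases("the conflict escalated"))                      # -2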

Page [20] is labeled as positive; however, after reading it, the page seems more neutral or even negative given the context of the passage. Page 25, on the other hand, is positive based on its context, tone, and overall imagery, and is labeled accurately.
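
To see why a page ends up with a particular label, a quick check like the one below lists the dictionary words that contribute to its score. It reuses afinn and book_df from the cells above; index 20 is used for illustration and may not correspond exactly to the page discussed here.

def explain_page(text):
    # Collect every word on the page that appears in the AFINN dictionary, with its score.
    hits = [(word, afinn[word]) for word in text.split() if word in afinn]
    return sum(score for _, score in hits), hits

total, hits = explain_page(book_df.cleantext[20])
print(total)
print(sorted(hits, key=lambda h: h[1])[:10])   # most negative contributors first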

T3.

Consider a specific outcome you would like to achieve with your sentiment analysis. That is, determine what sentiment you might want to have assigned to a specific piece of text. It could be one entry in your corpus, several documents, or the entire corpus. Make changes to the feature space and/or dictionary to achieve that outcome. Show specific results.


In [40]:
afinn = {}
for line in open("C:/Users/dmdal/OneDrive/Documents/TextMining/Assignment 4/AFINN-111.txt"):
    tt = line.split('\t')
    afinn.update({tt[0]:int(tt[1])})


#print(type(afinn), len(afinn))

for key, value in sorted(afinn.items())[0:10]:
    print(key + " => " + str(value))
print("~~~~~~~~~~~~")
for key, value in sorted(afinn.items())[2467:]:
    print(key + " => " + str(value))
    
def afinn_sent(inputstring):
    # Same scoring as before, but with a stricter cutoff for the Positive label
    sentcount = 0
    for word in inputstring.split():
        if word in afinn:
            sentcount = sentcount + afinn[word]

    if sentcount < -3:
        sentiment = 'Negative'
    elif sentcount > 7:
        sentiment = 'Positive'
    else:
        sentiment = 'Neutral'

    return sentiment

book_df['afinn'] = book_df.cleantext.apply(lambda x: afinn_sent(x))
#print(book_df.iloc[0:10][['cleantext','afinn']])
book_df.iloc[25:30][['cleantext','afinn']]
#print(book_df.cleantext[25])
#print(book_df.afinn[25])

book_df.afinn.value_counts()


abandon => -2
abandoned => -2
abandons => -2
abducted => -2
abduction => -2
abductions => -2
abhor => -3
abhorred => -3
abhorrent => -3
abhors => -3
~~~~~~~~~~~~
yeah => 1
yearning => 1
yeees => 2
yes => 1
youthful => 2
yucky => -2
yummy => 3
zealot => -2
zealots => -2
zealous => 2
Out[40]:
Neutral     25
Positive    20
Negative     6
Name: afinn, dtype: int64

Q3.

Write a short description of the exercise and the outcome. Make sure your answer is no longer than three paragraphs, and should at minimum answer these questions: What outcome did you choose? Why? How did you change the dictionary to achieve that outcome? How would you explain (justify, rationalize) those changes if necessary? Audience: general – management or non-technical staff.

A3.

I felt that many pages labeled positive seemed, after reading them, more neutral: the events happening on the page were just events, neither positive nor negative. To address this, I raised the requirement for a page to be labeled positive. The results match the expected outcome; by making it more difficult for a page to be labeled positive, a portion of the pages move from the positive label to neutral.

Originally, the count of positive pages was significantly higher than the rest. I would expect literature to lean neutral since it is telling a story. By adjusting the requirement for a positive result, I am able to balance positive and neutral pages more evenly. This can help us identify the more strongly positive events in the story.
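
The cutoff can also be treated as a parameter, as in the sketch below, which reuses the afinn dictionary and book_df from the cells above to show how the label distribution shifts as the positive threshold is raised (the cutoff values are just examples).

def label_page(text, pos_cutoff=3, neg_cutoff=-3):
    # Same scoring as afinn_sent, with the cutoffs exposed as parameters.
    total = sum(afinn.get(word, 0) for word in text.split())
    if total < neg_cutoff:
        return 'Negative'
    elif total > pos_cutoff:
        return 'Positive'
    return 'Neutral'

for cutoff in (3, 5, 7):
    labels = book_df.cleantext.apply(lambda x: label_page(x, pos_cutoff=cutoff))
    print(cutoff, labels.value_counts().to_dict())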

Q4.

Data science is all about finding patterns in the data. You have just been asked to decide on a pattern before finding it. Write a short description of how easy or difficult it was to arrive at a predetermined conclusion. How difficult was it to justify? What are the ethical issues involved, if any? What is your role as a data scientist? Audience: general – management or non-technical staff.

A4.

I think it is fairly easy to arrive at a predetermined conclusion, and there are several examples to support this. One immediate example is confirmation bias: we tend to find information that supports our preconceived ideas of the world or of a model, and if we find information that refutes our understanding, we are more inclined to critique it and look for flaws in the logic. This concern about predetermined conclusions isn't unique to data science; it exists in all sorts of fields. The old joke about accountants, "Well, what do you want the numbers to equal?", comes to mind.

In terms of data science specifically, we see this issue frequently, as pointed out in the book "Weapons of Math Destruction". We build a model based on some assumptions and the model turns into a positive feedback loop. A scenario I have been working through recently involves policing. Say we decide to put more police officers in a lower-income area because we believe those areas typically have higher crime. More officers in that area will most likely correlate with a higher number of arrests, and the higher arrest counts then "prove" that lower-income areas have more crime, so we allocate even more officers to the area and the cycle continues. Depending on what the model is used for, the ethical implications can range widely.

Your role as a data scientist, and as a scientist in general, is to remain as objective as possible and let the data speak for itself. When new models or theories come out, it is important to evaluate them for what they are and not get caught up in a trend simply because it is new. Likewise, it is important to remain a constant learner so you do not get stuck in outdated modes of thinking.